import urllib.request
import re
import os
from lxml import etree

def getHtmlCode(url):
    """Fetch *url* while masquerading as a mobile browser and return the
    page source decoded as UTF-8.

    :param url: address of the page to download.
    :return: the page HTML as a ``str``.
    :raises urllib.error.URLError: on network/HTTP failure.
    """
    # Mobile Chrome User-Agent so the server serves the mobile page.
    # (Original string was broken by a line continuation that injected a
    # run of literal spaces into the header value; joined cleanly here.)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/56.0.2924.87 Mobile Safari/537.36'
    }
    # Wrap the URL in a Request so the headers are sent with it.
    request = urllib.request.Request(url, headers=headers)
    # Context manager guarantees the connection is closed even on error
    # (the original leaked the response object).
    with urllib.request.urlopen(request) as response:
        page = response.read()
    # NOTE(review): assumes the response body is UTF-8; true for the
    # Baidu pages this script targets, but not charset-aware in general.
    return page.decode('UTF-8')


def getImage(page):
    """Find every https image URL (jpg/png/gif) in *page* HTML and
    download each one into ``./image``.

    :param page: HTML source as a ``str``.
    """
    # Each findall match is a tuple (full_url, extension) because the
    # pattern has two capture groups.
    imageList = re.findall(r'(https:[^\s]*?(jpg|png|gif))"', page)
    file_path = r'./image'
    # exist_ok=True avoids the exists()/mkdir() race of the original.
    os.makedirs(file_path, exist_ok=True)
    for x, (image_url, ext) in enumerate(imageList, start=1):
        try:
            print('正在下载: 弥豆子' + str(x) + ext)
            # Bug fix: save with the matched extension instead of always
            # '.png', so jpg/gif downloads are no longer mislabeled.
            image_save_path = '%s/弥豆子%d.%s' % (file_path, x, ext)
            # Download the image to the target path.
            urllib.request.urlretrieve(image_url, image_save_path)
            print('下载成功: 弥豆子' + str(x) + ext)
        except Exception as err:
            # Best-effort batch download: report the failure (the
            # original bare `except:` hid it) and continue with the rest.
            print('下载失败: 弥豆子' + str(x) + ext, err)

def getContent(pageContent):
    """Extract the article paragraphs (``<span class="bjh-p">`` text)
    from *pageContent*, print them numbered, and write them to
    ``./content.txt``.

    :param pageContent: HTML source as a ``str``.
    """
    ele = etree.HTML(pageContent)
    paragraphs = ele.xpath('//span[@class="bjh-p"]/text()')
    # Bug fix: the original path r'.\\content.txt' was a raw string with
    # a literal double backslash, producing an odd filename on POSIX; a
    # portable relative path is used instead.  `with` guarantees the
    # file is closed even if a write fails (original never closed it on
    # error).
    with open('./content.txt', 'w', encoding='utf-8') as fo:
        for num, text in enumerate(paragraphs, start=1):
            print(str(num) + '➔' + text)
            fo.write(str(num) + '➔' + text + '\n')

if __name__ == '__main__':
    ### Scrape images
    # Baidu image-search results page (query: 弥豆子 pictures).
    image_search_url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E7%A5%A2%E8%B1%86%E5%AD%90%E5%9B%BE%E7%89%87&fenlei=256&rsv_pq=0xcd7be4bc00003f7c&rsv_t=cde8TbU0FFgCl50SY1LOutSSmpMx5Tq5MoWIpS9eWUXXxgej0DGIXOtdw8F7&rqlang=en&rsv_enter=1&rsv_dl=tb&rsv_sug3=5&rsv_sug2=0&rsv_btype=i&inputT=158&rsv_sug4=158&rsv_jmp=fail"
    getImage(getHtmlCode(image_search_url))

    ### Scrape text
    # Baijiahao article whose paragraphs getContent() extracts.
    article_url = "https://baijiahao.baidu.com/s?id=1704818811781878826&wfr=spider&for=pc"
    getContent(getHtmlCode(article_url))